In [1]:
    
import pandas
from collections import defaultdict
    
In [2]:
    
# Load the tab-separated misalignment data.
pah = pandas.read_csv(
    'enwiki_pah_misalignment.tsv',
    sep='\t',
)
    
In [3]:
    
# Peek at the first rows to check the columns loaded as expected.
pah.head(n=5)
    
    Out[3]:
In [4]:
    
# Show the first rows of every dissonance class.
pah.groupby('dissonance').head(n=5)
    
    Out[4]:
In [5]:
    
# Read every line of the QID -> English name file up front; the context
# manager guarantees the handle is closed even if reading fails.
# NOTE(review): the file is opened with the platform default encoding --
# confirm whether it should be read explicitly as UTF-8.
with open('english-qid-names2016-03-27.csv','r') as qid_names_file:
    lines = qid_names_file.readlines()
    
In [6]:
    
# Build a QID -> English page name lookup from the "qid,name" lines.
qidnames = {}
for line in lines:
    # Split on the FIRST comma only: page names may contain commas themselves.
    # partition() never raises, so a line without a comma (e.g. a blank
    # trailing line) simply yields an empty name and is skipped below.
    qid, _, remainder = line.partition(',')
    # Keep everything before the line terminator.
    enname = remainder.split('\n')[0]
    if enname:
        qidnames[qid] = enname
    
In [7]:
    
import json
    
In [8]:
    
# Persist the lookup; closing the handle via the context manager makes sure
# the JSON is fully flushed to disk.
with open('qid_enpage.json','w') as qid_enpage_file:
    json.dump(qidnames, qid_enpage_file)
    
In [9]:
    
# One row per QID (the dict keys become the index).
endf = pandas.DataFrame.from_dict(data=qidnames, orient='index')
    
In [10]:
    
# Number of QIDs that have a non-empty English name.
endf.shape[0]
    
    Out[10]:
Todo:
+ map gender-enwiki-page-id
+ dissonance class priors
+ posteriors by gender
+ posterior for no gender.
In [11]:
    
# NOTE(review): absolute path to a snapshot on one specific machine; it has to
# be adjusted before the notebook can run anywhere else.
gender_index_csv = '/media/notconfusing/9d9b45fc-55f7-428c-a228-1c4c4a1b728c/home/maximilianklein/snapshot_data/2016-01-03/gender-index-data-2016-01-03.csv'
bigdf = pandas.read_csv(gender_index_csv)
    
In [12]:
    
# Keep only the two columns needed for the gender lookup. Take an explicit
# copy: later cells add/overwrite columns on this frame, and writing into a
# selection of bigdf would raise SettingWithCopyWarning.
gender_qid_df = bigdf[['qid','gender']].copy()
    
In [13]:
    
def map_gender(x):
    """Map a raw gender cell to one of the labels used in this analysis.

    Parameters
    ----------
    x : str or missing value
        A pipe-separated string of gender QIDs; only the first entry is
        considered. Anything that is not a string (NaN, None, ...) is treated
        as missing.

    Returns
    -------
    str
        'female' for Q6581072, 'male' for Q6581097, 'nonbin' for any other
        first value, and 'no gender' when the cell is missing.

    Values that already are one of these four labels are returned unchanged,
    so applying the function to an already-mapped column (for example when a
    cell is re-run) does not collapse every label to 'nonbin'.
    """
    labels = ('female', 'male', 'nonbin', 'no gender')
    # Missing cells arrive as float NaN from pandas; be equally tolerant of
    # None and other non-string placeholders instead of crashing on .split().
    if not isinstance(x, str):
        return 'no gender'
    # Idempotence: pass through values this function produced earlier.
    if x in labels:
        return x
    gen = x.split('|')[0]
    if gen == 'Q6581072':
        return 'female'
    if gen == 'Q6581097':
        return 'male'
    return 'nonbin'
# Replace the raw QID strings with readable labels. assign() builds a new
# frame instead of writing into a selection of bigdf, which avoids
# SettingWithCopyWarning.
gender_qid_df = gender_qid_df.assign(gender=gender_qid_df['gender'].apply(map_gender))
    
    
In [14]:
    
def qid2enname(x):
    """Return the English page name stored in qidnames for QID x, or None if there is none."""
    return qidnames.get(x)
# Attach the English page name (None when the QID has no entry in qidnames).
# assign() returns a new frame rather than writing into a possibly
# slice-derived one, which avoids SettingWithCopyWarning.
gender_qid_df = gender_qid_df.assign(enname=gender_qid_df['qid'].apply(qid2enname))
    
    
In [15]:
    
# Header-less, tab-separated file read as (enname, pageid) pairs.
# NOTE(review): absolute path to a local checkout; adjust before running elsewhere.
# NOTE(review): with pandas' default NA parsing, titles such as "NA", "NaN" or
# "null" are read as missing values -- confirm whether that is intended.
enname_id_path = '/home/notconfusing/workspace/wikidumpparse/wikidump/mediawiki-utilities/enname_id.txt'
enname_id = pandas.read_csv(
    enname_id_path,
    sep='\t',
    names=['enname','pageid'],
)
    
In [16]:
    
# Join gender records to page ids through the English page name.
# pandas matches null join keys to each other (unlike SQL), so every QID
# without an English name (enname is None) would be paired with every page
# whose title was parsed as missing. Drop null names on both sides first.
gender_page_id = pandas.merge(
    gender_qid_df.dropna(subset=['enname']),
    enname_id.dropna(subset=['enname']),
    how='inner',
    on='enname',
)
    
In [17]:
    
# Left join keeps every misalignment row; pages without a gender record get
# missing values in the columns coming from gender_page_id.
pah_gender = pah.merge(gender_page_id, how='left', on='pageid')
    
In [18]:
    
# Display the merged frame (misalignment rows plus whatever gender data matched on pageid).
pah_gender
    
    Out[18]:
In [19]:
    
# Row counts before and after the merge: (pah, gender_page_id, pah_gender).
tuple(len(frame) for frame in (pah, gender_page_id, pah_gender))
    
    Out[19]:
Relative risk: P(gender | misaligned) / P(gender)
What proportion of the misaligned dataset is about women?
For each gender, what proportion of each misalignment group do they represent?
In [20]:
    
# Rows that found no gender record in the left join are labelled 'nonbio'.
pah_gender = pah_gender.assign(gender=pah_gender['gender'].fillna('nonbio'))
    
In [68]:
    
# Relative risk per gender and misalignment group:
#     P(gender | misalignment group) / P(gender)
# SE collects the two negative dissonance classes, NI the two positive ones.
SE = pah_gender[pah_gender['dissonance'].isin(['Moderate negative', 'High negative'])]
NI = pah_gender[pah_gender['dissonance'].isin(['Moderate positive', 'High positive'])]

# P(gender) does not depend on the misalignment group, so count it once
# instead of re-filtering the full frame inside the loops.
overall_counts = pah_gender['gender'].value_counts()
n_total = len(pah_gender)

rel_risk = defaultdict(dict)
for risk, risk_name in [(SE,'Spent Effort'), (NI,'Needs Improvement')]:
    group_counts = risk['gender'].value_counts()
    n_group = len(risk)
    for gender in ['female','male','nonbin','nonbio']:
        gen_mis = int(group_counts.get(gender, 0))
        gen_all = int(overall_counts.get(gender, 0))
        # An empty group or an absent gender makes the ratio undefined; report
        # NaN instead of raising ZeroDivisionError.
        p_gen_mis = gen_mis / n_group if n_group else float('nan')   # p(gender|misalignment)
        p_gen = gen_all / n_total if n_total else float('nan')       # p(gender)
        print(p_gen_mis, p_gen)
        rel_risk[gender][risk_name] = p_gen_mis / p_gen if p_gen else float('nan')  # relative risk
    
    
In [24]:
    
# Smallest 32-bit signed integer; occurrences of it in the CSV are read as missing.
# NOTE(review): presumably the exporter's placeholder for "no value" -- confirm.
java_min_int = -(2 ** 31)
# NOTE(review): absolute path to a snapshot on one specific machine; adjust before running elsewhere.
allrecs_csv = '/media/notconfusing/9d9b45fc-55f7-428c-a228-1c4c4a1b728c/home/maximilianklein/snapshot_data/2016-01-03/gender-index-data-2016-01-03.csv'
allrecs = pandas.read_csv(
    allrecs_csv,
    na_values=[java_min_int],
)
    
In [26]:
    
def sum_column(q_str):
    """Count the pipe-separated fields of q_str.

    Returns the number of pieces that splitting q_str on '|' yields, or None
    when q_str is not exactly a str (e.g. a NaN cell).

    NOTE(review): the original comment says the format always ends with a
    '|'; if so, the empty piece after the last pipe is counted too -- confirm
    that this is the intended count.
    """
    if type(q_str) is not str:
        return None
    # n separators delimit n + 1 fields, which is what len(split('|')) gives.
    return q_str.count('|') + 1
# Replace the raw pipe-delimited strings with their field counts.
pipe_list_columns = ('site_links',)
for col in pipe_list_columns:
    field_counts = allrecs[col].apply(sum_column)
    allrecs[col] = field_counts
    
In [27]:
    
# Spot-check the first 20 site-link counts.
allrecs['site_links'].iloc[:20]
    
    Out[27]:
In [29]:
    
# Turn the raw gender QID strings into readable labels, element by element.
allrecs['gender'] = allrecs['gender'].map(map_gender)
    
In [78]:
    
# Average site-link count per gender, expressed relative to a fixed baseline.
# NOTE(review): 2.6 is presumably the average site-link count of non-biography
# items (whose ratio is pinned to 1 below) -- confirm the source of this number.
sitelink_baseline = 2.6

sl_risk = defaultdict(dict)
sl_risk['nonbio']['Sitelink Ratio'] = 1
for gender in ('female', 'male', 'nonbin'):
    gender_rows = allrecs.loc[allrecs['gender'] == gender]
    # sum() skips missing counts while the divisor counts every row, so rows
    # without a site_links value contribute zero to the average.
    mean_links = gender_rows['site_links'].sum() / len(gender_rows)
    sl_risk[gender]['Sitelink Ratio'] = mean_links / sitelink_baseline
    
In [79]:
    
# One row per gender, one 'Sitelink Ratio' column.
sl_risk_df = pandas.DataFrame.from_dict(data=sl_risk, orient='index')
    
In [80]:
    
# One row per gender, one column per misalignment group.
rel_risk_df = pandas.DataFrame.from_dict(data=rel_risk, orient='index')
    
In [81]:
    
# Combine both risk tables on their shared gender index (left join on sl_risk_df).
risk_df = sl_risk_df.join(rel_risk_df)
    
In [82]:
    
# Relabel rows by KEY, not by position: the row order of risk_df follows how
# the source dicts were built (sl_risk inserts 'nonbio' first), so assigning a
# list of labels positionally can attach the wrong name to a row. rename()
# maps each key explicitly and reindex() fixes the presentation order.
gender_display_names = {
    'female': 'Female',
    'male': 'Male',
    'nonbin': 'Non-binary',
    'nonbio': 'Non-biography',
}
risk_df = risk_df.rename(index=gender_display_names).reindex(list(gender_display_names.values()))
    
In [83]:
    
# Render the final table as LaTeX with two-decimal floats.
table_columns = ['Needs Improvement','Spent Effort', 'Sitelink Ratio']
latex_table = risk_df.to_latex(
    columns=table_columns,
    float_format=lambda value: '%.2f' % value,
)
print(latex_table)